In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
In [2]:
# Load the weather observations from the CSV that sits next to the notebook.
# pandas is already imported in the first cell, so it is not re-imported here.
df = pd.read_csv("weather.csv")
# Preview the first five rows to sanity-check that the file parsed as expected.
df.head()
Out[2]:
| | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | WindSpeed9am | ... | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RISK_MM | RainTomorrow |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.0 | 24.3 | 0.0 | 3.4 | 6.3 | NW | 30.0 | SW | NW | 6.0 | ... | 29 | 1019.7 | 1015.0 | 7 | 7 | 14.4 | 23.6 | No | 3.6 | Yes |
| 1 | 14.0 | 26.9 | 3.6 | 4.4 | 9.7 | ENE | 39.0 | E | W | 4.0 | ... | 36 | 1012.4 | 1008.4 | 5 | 3 | 17.5 | 25.7 | Yes | 3.6 | Yes |
| 2 | 13.7 | 23.4 | 3.6 | 5.8 | 3.3 | NW | 85.0 | N | NNE | 6.0 | ... | 69 | 1009.5 | 1007.2 | 8 | 7 | 15.4 | 20.2 | Yes | 39.8 | Yes |
| 3 | 13.3 | 15.5 | 39.8 | 7.2 | 9.1 | NW | 54.0 | WNW | W | 30.0 | ... | 56 | 1005.5 | 1007.0 | 2 | 7 | 13.5 | 14.1 | Yes | 2.8 | Yes |
| 4 | 7.6 | 16.1 | 2.8 | 5.6 | 10.6 | SSE | 50.0 | SSE | ESE | 20.0 | ... | 49 | 1018.3 | 1018.5 | 7 | 7 | 11.1 | 15.4 | Yes | 0.0 | No |
5 rows × 22 columns
In [3]:
# Print the frame's structure: each column's dtype and non-null count,
# which reveals the columns that contain missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 366 entries, 0 to 365 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MinTemp 366 non-null float64 1 MaxTemp 366 non-null float64 2 Rainfall 366 non-null float64 3 Evaporation 366 non-null float64 4 Sunshine 363 non-null float64 5 WindGustDir 363 non-null object 6 WindGustSpeed 364 non-null float64 7 WindDir9am 335 non-null object 8 WindDir3pm 365 non-null object 9 WindSpeed9am 359 non-null float64 10 WindSpeed3pm 366 non-null int64 11 Humidity9am 366 non-null int64 12 Humidity3pm 366 non-null int64 13 Pressure9am 366 non-null float64 14 Pressure3pm 366 non-null float64 15 Cloud9am 366 non-null int64 16 Cloud3pm 366 non-null int64 17 Temp9am 366 non-null float64 18 Temp3pm 366 non-null float64 19 RainToday 366 non-null object 20 RISK_MM 366 non-null float64 21 RainTomorrow 366 non-null object dtypes: float64(12), int64(5), object(5) memory usage: 63.0+ KB
In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Out[4]:
| | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RISK_MM |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 363.000000 | 364.000000 | 359.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 | 366.000000 |
| mean | 7.265574 | 20.550273 | 1.428415 | 4.521858 | 7.909366 | 39.840659 | 9.651811 | 17.986339 | 72.035519 | 44.519126 | 1019.709016 | 1016.810383 | 3.890710 | 4.024590 | 12.358470 | 19.230874 | 1.428415 |
| std | 6.025800 | 6.690516 | 4.225800 | 2.669383 | 3.481517 | 13.059807 | 7.951929 | 8.856997 | 13.137058 | 16.850947 | 6.686212 | 6.469422 | 2.956131 | 2.666268 | 5.630832 | 6.640346 | 4.225800 |
| min | -5.300000 | 7.600000 | 0.000000 | 0.200000 | 0.000000 | 13.000000 | 0.000000 | 0.000000 | 36.000000 | 13.000000 | 996.500000 | 996.800000 | 0.000000 | 0.000000 | 0.100000 | 5.100000 | 0.000000 |
| 25% | 2.300000 | 15.025000 | 0.000000 | 2.200000 | 5.950000 | 31.000000 | 6.000000 | 11.000000 | 64.000000 | 32.250000 | 1015.350000 | 1012.800000 | 1.000000 | 1.000000 | 7.625000 | 14.150000 | 0.000000 |
| 50% | 7.450000 | 19.650000 | 0.000000 | 4.200000 | 8.600000 | 39.000000 | 7.000000 | 17.000000 | 72.000000 | 43.000000 | 1020.150000 | 1017.400000 | 3.500000 | 4.000000 | 12.550000 | 18.550000 | 0.000000 |
| 75% | 12.500000 | 25.500000 | 0.200000 | 6.400000 | 10.500000 | 46.000000 | 13.000000 | 24.000000 | 81.000000 | 55.000000 | 1024.475000 | 1021.475000 | 7.000000 | 7.000000 | 17.000000 | 24.000000 | 0.200000 |
| max | 20.900000 | 35.800000 | 39.800000 | 13.800000 | 13.600000 | 98.000000 | 41.000000 | 52.000000 | 99.000000 | 96.000000 | 1035.700000 | 1033.200000 | 8.000000 | 8.000000 | 24.700000 | 34.500000 | 39.800000 |
In [5]:
# (rows, columns) of the loaded frame.
df.shape
Out[5]:
(366, 22)
In [6]:
# Column labels of the loaded frame, for reference when selecting features below.
df.columns
Out[6]:
Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
dtype='object')
In [7]:
# Pairwise relationships across the frame's numeric columns.
# seaborn and matplotlib are already imported in the first cell.
# NOTE(review): describe() reports 17 numeric columns, so this draws a 17x17
# grid and can take a while to render; a focused subset is usually easier to read.
sns.pairplot(df)
plt.show()
In [8]:
# Scatter of the daily maximum temperature against the 3pm humidity reading.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['MaxTemp'], df['Humidity3pm'])
ax.set_xlabel('Max Temperature')
ax.set_ylabel('Humidity at 3pm')
ax.set_title('Max Temperature vs Humidity (3pm)')
plt.show()
In [9]:
# Focused pair plot restricted to the two temperature columns and rainfall.
pairplot_columns = ['MinTemp', 'MaxTemp', 'Rainfall']
sns.pairplot(df[pairplot_columns])
plt.show()
In [10]:
# Scatter of the daily minimum temperature against the 9am humidity reading.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['MinTemp'], df['Humidity9am'])
ax.set_xlabel('Min Temperature')
ax.set_ylabel('Humidity at 9am')
ax.set_title('Min Temperature vs Humidity (9am)')
plt.show()
In [11]:
# Keep only the numeric columns; correlation is undefined for the text columns.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

# Draw the pairwise correlations as a heatmap on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap of Weather Data")
plt.show()
In [12]:
# Restrict to numeric columns so every feature can be fed to a linear model.
numeric_df = df.select_dtypes(include='number')

# Rainfall is the regression target; every other numeric column is a feature.
# NOTE(review): in the df.head() preview each row's RISK_MM equals the next
# row's Rainfall, so RISK_MM looks like next-day rainfall. Using it as a
# feature would leak future information -- confirm whether it should be dropped.
target_column = 'Rainfall'
X = numeric_df.drop(columns=target_column)
y = numeric_df[target_column]

X.shape, y.shape
Out[12]:
((366, 16), (366,))
In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable. train_test_split is already imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape
Out[13]:
((292, 16), (74, 16))
In [15]:
# Count missing values in each numeric column.
numeric_df.isnull().sum()
Out[15]:
MinTemp 0 MaxTemp 0 Rainfall 0 Evaporation 0 Sunshine 3 WindGustSpeed 2 WindSpeed9am 7 WindSpeed3pm 0 Humidity9am 0 Humidity3pm 0 Pressure9am 0 Pressure3pm 0 Cloud9am 0 Cloud3pm 0 Temp9am 0 Temp3pm 0 RISK_MM 0 dtype: int64
In [16]:
# Count infinite values in each numeric column (numpy is imported in the first cell).
np.isinf(numeric_df).sum()
Out[16]:
MinTemp 0 MaxTemp 0 Rainfall 0 Evaporation 0 Sunshine 0 WindGustSpeed 0 WindSpeed9am 0 WindSpeed3pm 0 Humidity9am 0 Humidity3pm 0 Pressure9am 0 Pressure3pm 0 Cloud9am 0 Cloud3pm 0 Temp9am 0 Temp3pm 0 RISK_MM 0 dtype: int64
In [17]:
# Fill missing feature values with the column mean (SimpleImputer is imported
# in the first cell). The imputer is fitted on the training split only and the
# same fitted means are then applied to the test split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
In [18]:
# Fit a linear regression on the imputed training features and score it on the
# held-out split. LinearRegression, r2_score and mean_squared_error are all
# imported in the first cell.
model = LinearRegression()
model.fit(X_train_imputed, y_train)
y_pred = model.predict(X_test_imputed)

# R^2 and mean squared error on the test split, displayed as a tuple.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2, mse
Out[18]:
(0.19041298786157834, 34.2256782862109)
In [21]:
# Second experiment: predict Rainfall from the two temperature columns only.
# Note that this rebinds X and y, replacing the full numeric feature set.
feature_columns = ['MinTemp', 'MaxTemp']
X = df[feature_columns]
y = df['Rainfall']
In [22]:
# Split the temperature-only features 80/20 with a fixed seed; this rebinds
# the train/test variables used by the earlier model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
In [24]:
# Fit a fresh linear regression on the current training split; this rebinds
# `model`, so the previously fitted estimator is no longer reachable by name.
model = LinearRegression()
# fit() is the cell's last expression, so the fitted estimator is displayed.
model.fit(X_train, y_train)
Out[24]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| fit_intercept | True | |
| copy_X | True | |
| tol | 1e-06 | |
| n_jobs | None | |
| positive | False |
In [25]:
# Score the current model on the held-out split and report its mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error for Rainfall Prediction: {mse}')
Mean Squared Error for Rainfall Prediction: 37.0768456005826
In [27]:
# Mean MaxTemp within each RainToday group.
df.groupby("RainToday")["MaxTemp"].mean()
Out[27]:
RainToday No 20.756667 Yes 19.612121 Name: MaxTemp, dtype: float64
In [28]:
# Mean MaxTemp within each RainTomorrow group.
df.groupby("RainTomorrow")["MaxTemp"].mean()
Out[28]:
RainTomorrow No 20.396000 Yes 21.251515 Name: MaxTemp, dtype: float64